Code Background

Looking at whiff data for left-handed pitchers (35 pitchers from 2022, all with 1842+ seasonal pitches).


Libraries

library(tidyverse)
library(readxl)
library(knitr)
library(ranger)
library(glmnet)
library(forcats)
library(olsrr)
library(Metrics)
library(mgcv)
library(caret)
# Geom Zone (Jackie's) ####

geom_zone <- function(top = 11/3, bottom = 3/2, linecolor = "black"){
  geom_rect(xmin = -.7083, xmax = .7083, ymin = bottom, ymax = top,
            alpha = 0, color = linecolor, linewidth = 0.75)
}

# c(0, 0, -.25, -.5, -.25))

geom_plate <- function(pov = "pitcher"){
   df <- case_when(
     pov == "pitcher" ~ 
       data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, .25, .5, .25)),
     pov == "catcher" ~ 
       data.frame(x = c(-.7083, .7083, .7083 ,0, -.7083), y = c(0, 0, -.25, -.5, -.25))
   )
  
   g <- geom_polygon(data = df, aes(x = x, y = y), fill = "white", color = "black", linewidth = 1.25)
  g
}

Data

all <- read_csv("CSVs/all_pitches.csv") %>% 
  select(-...1) %>% 
  mutate(wOBAr = case_when(
    woba >= 0.370 ~ 6,
    woba >= 0.340 ~ 5,
    woba >= 0.310 ~ 4,
    woba >= 0.280 ~ 3,
    woba >= 0.250 ~ 2,
    woba >= 0.220 ~ 1)) %>% 
  mutate(wOBAr = as.factor(wOBAr))

whiff <- all %>% 
  mutate(whiff = description == "swinging_strike",
         whiff = as.character(whiff)) %>% 
  filter(pitch_type != "NA",
         pitch_type != "PO")

Variable Exploration by Whiff

# Pitch Speed
whiff %>% 
  ggplot(aes(y = whiff, x = pitch_speed, color = pitch_name)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~pitch_name) +
  labs(title = "Whiff vs. Non-Whiff by Pitch Speed",
       x = "Pitch Speed (mph)",
       y = "Outcome",
       color = "Pitch Type") +
  NULL

# Horizontal Movement
whiff %>% 
  ggplot(aes(y = whiff, x = pfx_x*12, color = pitch_name)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~pitch_name) +
  labs(title = "Whiff vs. Non-Whiff by Horizontal Movement",
       x = "Horizontal Movement (in.)",
       y = "Outcome",
       color = "Pitch Type") +
  NULL

# Vertical Movement
whiff %>% 
  ggplot(aes(y = whiff, x = pfx_z*12, color = pitch_name)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~pitch_name) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Movement",
       x = "Induced Vertical Movement (in.)",
       y = "Outcome",
       color = "Pitch Type") +
  NULL

# Horizontal Pitch Location
whiff %>% 
  ggplot(aes(y = whiff, x = plate_x, color = pitch_name)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~pitch_name) +
  labs(title = "Whiff vs. Non-Whiff by Horizontal Pitch Location",
       x = "Horizontal Pitch Location (ft)",
       y = "Outcome",
       color = "Pitch Type") +
  NULL

# Vertical Pitch Location
whiff %>% 
  ggplot(aes(y = whiff, x = plate_z, color = pitch_name)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~pitch_name) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Pitch Location",
       x = "Vertical Pitch Location (ft)",
       y = "Outcome",
       color = "Pitch Type") +
  NULL

# Spin Rate
whiff %>% 
  ggplot(aes(y = whiff, x = release_spin_rate, color = pitch_name)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~pitch_name) +
  labs(title = "Whiff vs. Non-Whiff by Spin Rate",
       x = "Spin Rate (rpm)",
       y = "Outcome",
       color = "Pitch Type") +
  NULL

Logistic Regression

# Slider Logistic Model
whiff_sl <- whiff %>% 
  filter(pitch_type == "SL") %>% 
  mutate(whiff = str_replace(whiff, "TRUE", "1"),
         whiff = str_replace(whiff, "FALSE", "0"),
         whiff = as.numeric(whiff))

# Original Model
model1 <- glm(whiff ~ pitch_speed + spin_axis + pfx_x + pfx_z + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + distance,
              data = whiff_sl, family = binomial)

# Reduced Model
model1 <- glm(whiff ~ pitch_speed + plate_x + plate_z +
              release_spin_rate + speed_change + break_change + pfx_total + distance,
              data = whiff_sl)

summary(model1)
## 
## Call:
## glm(formula = whiff ~ pitch_speed + plate_x + plate_z + release_spin_rate + 
##     speed_change + break_change + pfx_total + distance, data = whiff_sl)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.474e-02  5.943e-02  -0.248 0.804098    
## pitch_speed        2.416e-03  6.789e-04   3.559 0.000373 ***
## plate_x            1.720e-02  1.959e-03   8.782  < 2e-16 ***
## plate_z            3.157e-02  2.477e-03  12.747  < 2e-16 ***
## release_spin_rate  2.714e-05  7.244e-06   3.746 0.000180 ***
## speed_change       8.478e-03  1.604e-03   5.285 1.26e-07 ***
## break_change       4.843e-02  1.230e-02   3.937 8.25e-05 ***
## pfx_total         -2.533e-02  6.572e-03  -3.854 0.000116 ***
## distance          -1.078e-01  3.328e-03 -32.395  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.121298)
## 
##     Null deviance: 5646.6  on 45219  degrees of freedom
## Residual deviance: 5484.0  on 45211  degrees of freedom
##   (126 observations deleted due to missingness)
## AIC: 32948
## 
## Number of Fisher Scoring iterations: 2
preds <- whiff_sl %>% 
  mutate(prediction_log = predict(model1, whiff_sl),
         prediction = 1 / (1 + exp(-prediction_log)),
         rounded_pred = case_when(
           prediction >= 0.5 ~ 1,
           prediction < 0.5 ~ 0
         )) %>% 
  filter(!is.na(prediction))

preds %>% 
  ggplot(aes(x = as.character(whiff), y = prediction)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.1, width = 0.1, height = 0)

preds %>% 
  mutate(prediction = round(prediction, 2)) %>% 
  group_by(prediction) %>% 
  summarize(mean(whiff)) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = prediction, y = `mean(whiff)`)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(y = "observed whiff proportion",
       x = "projected % whiff chance",
       title = "Whiff proportion by predicted whiff value",
       subtitle = "Whiff predictions have a 1% bin width")

whiff %>% 
  mutate(count = paste0(balls, "-", strikes)) %>% 
  filter(pitch_type == "SL") %>% 
  ggplot(aes(y = whiff, x = pfx_z*12)) +
  geom_violin() +
  geom_boxplot(alpha = 0.5, width = 0.5) +
  facet_wrap(~count) +
  labs(title = "Whiff vs. Non-Whiff by Vertical Movement",
       x = "Induced Vertical Movement (in.)",
       y = "Outcome") +
  NULL

# Sliders
whiff %>% 
  filter(pitch_type =="SL") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

# Fastballs
whiff %>% 
  filter(pitch_type =="FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

# Change-Ups
whiff %>% 
  filter(pitch_type =="CH") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  # geom_point(alpha = 0.2) +
  coord_fixed() +
  facet_grid(cols = vars(hitter), rows = vars(whiff)) +
  theme_bw()

whiff %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(hitter == "R",
         pitch_type == "SL",
         prev_pitch  %in% c("FF", "CH", "SL", "CU"),
         player_name == "Fried, Max") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(prev_pitch), rows = vars(whiff)) +
  theme_bw()

whiff %>% 
  arrange(game_date, player_name, at_bat_number, pitch_number) %>% 
  mutate(prev_pitch = lag(pitch_type, n = 1, default = NA)) %>% 
  filter(hitter == "R",
         pitch_type == "FF") %>% 
  ggplot(aes(x = -plate_x, y = plate_z)) + 
  geom_density_2d_filled(contour_var = "density", alpha = 0.5) +
  geom_zone() +
  coord_fixed() +
  facet_grid(cols = vars(wOBAr), rows = vars(whiff)) +
  theme_bw()

zoned <- whiff %>% 
  mutate(loc_x = round(plate_x*3, 0),
         loc_y = round(plate_z*3, 0))

zoned %>% 
  filter(pitch_type == "FF",
         plate_z > 0 & plate_z < 6,
         plate_x > -1.5 & plate_x < 1.5) %>% 
  summarize(whiff_perc = mean(whiff == "TRUE"),
            pitches = n(),
            .by = c(loc_x, loc_y, wOBAr)) %>% 
  filter(pitches >= 10) %>% 
  ggplot(aes(x = -loc_x, y = loc_y, fill = whiff_perc)) + 
  geom_tile() +
  scale_fill_gradient(low = "gray", high = "red") +
  facet_wrap(~ wOBAr) +
  coord_fixed() +
  theme_bw()

all %>% 
  ggplot(aes(x = wOBAr, pitch_speed, color = wOBAr)) +
  geom_boxplot() +
  facet_wrap(~ pitch_name)

all %>% 
  ggplot(aes(x = estimated_woba_using_speedangle, pitch_speed)) +
  geom_point(alpha = 0.2) +
  facet_wrap(~ pitch_name)